# Importing all the required libraries
library(tidyverse)
library(ggplot2)
library(plotly)
library(ipumsr)
library(haven)
library(splitstackshape)
library(caret)
library(randomForest)
# A. Loading the data and creating a data frame
# Read the IPUMS DDI codebook first, then the microdata extract it describes
df_ddi <- read_ipums_ddi("Dataset/nhis_00001.xml")
df <- as.data.frame(read_ipums_micro(df_ddi, verbose = FALSE))
# Peek at the first five rows
head(df, 5)
## YEAR SERIAL STRATA PSU NHISHID REGION URBRRL PERNUM NHISPID
## 1 2020 1 150 25 0002020H000002 3 2 1 0002020H00000210
## 2 2020 2 111 10 0002020H000003 4 3 1 0002020H00000310
## 3 2020 3 133 3 0002020H000004 2 2 1 0002020H00000410
## 4 2020 4 139 45 0002020H000007 3 3 1 0002020H00000710
## 5 2020 5 130 21 0002020H000009 3 1 1 0002020H00000910
## HHX SAMPWEIGHT LONGWEIGHT PARTWEIGHT ASTATFLG CSTATFLG AGE SEX SEXORIEN
## 1 H000002 5946.002 17605.50 0.000 1 0 56 2 2
## 2 H000003 6288.726 0.00 5853.664 1 0 66 1 2
## 3 H000004 6083.271 0.00 5814.397 1 0 59 2 2
## 4 H000007 11306.962 0.00 10626.550 1 0 56 1 2
## 5 H000009 6471.818 19317.18 0.000 1 0 48 2 2
## MARSTAT MARST FAMSIZE PARTNEREMP FAMKIDNO RACEA ARMFEV EDUC SPOUSEDUC
## 1 10 11 3 0 1 100 11 501 3
## 2 10 11 2 0 0 100 11 501 10
## 3 10 11 2 0 0 100 11 502 9
## 4 30 30 1 0 0 100 11 103 0
## 5 99 99 4 0 1 100 98 400 0
## SCHOOLNOW EMPSTAT HOURSWRK PAIDSICK EMPHI EMPFT INCFAM07ON FAMTOTINC USUALPL
## 1 1 200 0 0 0 0 11 27000 2
## 2 1 100 40 2 2 2 24 250000 3
## 3 1 100 40 1 1 2 24 150000 2
## 4 1 999 0 0 0 0 22 54000 2
## 5 8 999 0 0 0 0 23 95000 2
## HINOTCOVE CVDDIAG CVDTEST CVDTESTRSLT
## 1 1 1 1 0
## 2 1 0 0 0
## 3 1 0 0 0
## 4 1 0 0 0
## 5 2 1 1 0
# B. Preliminary check on the data
# a. Review classes
# Fix: sapply() has an unstable return type (it simplifies only when it can).
# Here the haven_labelled columns have multi-element class vectors, so
# simplification fails and a list comes back anyway; lapply() makes that
# list return type explicit and stable.
lapply(df, class)
## $YEAR
## [1] "numeric"
##
## $SERIAL
## [1] "numeric"
##
## $STRATA
## [1] "haven_labelled" "vctrs_vctr" "double"
##
## $PSU
## [1] "haven_labelled" "vctrs_vctr" "double"
##
## $NHISHID
## [1] "character"
##
## $REGION
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $URBRRL
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $PERNUM
## [1] "numeric"
##
## $NHISPID
## [1] "character"
##
## $HHX
## [1] "character"
##
## $SAMPWEIGHT
## [1] "numeric"
##
## $LONGWEIGHT
## [1] "numeric"
##
## $PARTWEIGHT
## [1] "numeric"
##
## $ASTATFLG
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $CSTATFLG
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $AGE
## [1] "haven_labelled" "vctrs_vctr" "double"
##
## $SEX
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $SEXORIEN
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $MARSTAT
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $MARST
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $FAMSIZE
## [1] "haven_labelled" "vctrs_vctr" "double"
##
## $PARTNEREMP
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $FAMKIDNO
## [1] "haven_labelled" "vctrs_vctr" "double"
##
## $RACEA
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $ARMFEV
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $EDUC
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $SPOUSEDUC
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $SCHOOLNOW
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $EMPSTAT
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $HOURSWRK
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $PAIDSICK
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $EMPHI
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $EMPFT
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $INCFAM07ON
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $FAMTOTINC
## [1] "haven_labelled" "vctrs_vctr" "double"
##
## $USUALPL
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $HINOTCOVE
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $CVDDIAG
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $CVDTEST
## [1] "haven_labelled" "vctrs_vctr" "integer"
##
## $CVDTESTRSLT
## [1] "haven_labelled" "vctrs_vctr" "integer"
# Note: some of the variables have attached data definitions "haven labeled"
# b. Check for missing values per column
# Fix: vapply() pins the return type to a named integer vector, unlike
# sapply(), whose output shape silently depends on its input.
vapply(df, function(x) sum(is.na(x)), integer(1))
## YEAR SERIAL STRATA PSU NHISHID REGION
## 0 0 0 0 0 0
## URBRRL PERNUM NHISPID HHX SAMPWEIGHT LONGWEIGHT
## 0 0 0 0 0 0
## PARTWEIGHT ASTATFLG CSTATFLG AGE SEX SEXORIEN
## 0 0 0 0 0 0
## MARSTAT MARST FAMSIZE PARTNEREMP FAMKIDNO RACEA
## 0 0 0 0 0 0
## ARMFEV EDUC SPOUSEDUC SCHOOLNOW EMPSTAT HOURSWRK
## 0 0 0 0 0 0
## PAIDSICK EMPHI EMPFT INCFAM07ON FAMTOTINC USUALPL
## 0 0 0 0 0 0
## HINOTCOVE CVDDIAG CVDTEST CVDTESTRSLT
## 0 0 0 0
# c. Drop identifier and survey-weight variables that provide no insights
# for this analysis
droppable_cols <- c("YEAR", "SERIAL", "STRATA", "PSU", "NHISHID", "NHISPID",
                    "HHX", "SAMPWEIGHT", "LONGWEIGHT", "PARTWEIGHT")
df <- dplyr::select(df, -all_of(droppable_cols))
# Confirm the remaining columns
colnames(df)
## [1] "REGION" "URBRRL" "PERNUM" "ASTATFLG" "CSTATFLG"
## [6] "AGE" "SEX" "SEXORIEN" "MARSTAT" "MARST"
## [11] "FAMSIZE" "PARTNEREMP" "FAMKIDNO" "RACEA" "ARMFEV"
## [16] "EDUC" "SPOUSEDUC" "SCHOOLNOW" "EMPSTAT" "HOURSWRK"
## [21] "PAIDSICK" "EMPHI" "EMPFT" "INCFAM07ON" "FAMTOTINC"
## [26] "USUALPL" "HINOTCOVE" "CVDDIAG" "CVDTEST" "CVDTESTRSLT"
# Feature engineering
# REGION, URBRRL, PERNUM, ASTATFLG, CSTATFLG, AGE and FAMKIDNO need no
# feature engineering yet.
# a. Handling the SEX and SEXORIEN variables
# Collapse all 'unknown' SEX codes (7, 8, 9) into a single 9
df$SEX[df$SEX %in% c(7, 8, 9)] <- 9
# Fold unknown SEXORIEN codes (5, 7, 8) into 4 ("Something else")
df$SEXORIEN[df$SEXORIEN %in% c(5, 7, 8)] <- 4
# b. Handling the RACEA variable
# Collapse every 'unknown' race code into the single 900 bucket
df$RACEA[df$RACEA %in% c(580, 900, 970, 980, 990)] <- 900
# c. Handling the MARSTAT & MARST variables
# Both variables encode the same feature; keep MARST (current marital
# status) for this analysis and drop MARSTAT.
# Fix: qualify select() as dplyr::select() for consistency with the rest of
# the script (L185, L383) — MASS is loaded later in this script and masks
# select(), so an unqualified call is fragile on re-runs.
df <- df %>% dplyr::select(-MARSTAT)
# Combine NIU (0) and Unknown (99) into 99, and all married codes into 10
df$MARST[df$MARST %in% c(0, 99)] <- 99
df$MARST[df$MARST %in% c(10, 11, 12, 13)] <- 10
# d. Handling FAMSIZE: collapse the two unknown codes (98, 99) into 98
df$FAMSIZE[df$FAMSIZE %in% c(98, 99)] <- 98
# e. Handling PARTNEREMP: collapse unknown codes (7, 8, 9) into 9
df$PARTNEREMP[df$PARTNEREMP %in% c(7, 8, 9)] <- 9
# f. Handling ARMFEV: collapse unknown codes (97, 98, 99) into 99
df$ARMFEV[df$ARMFEV %in% c(97, 98, 99)] <- 99
# g. Handling EDUC, SPOUSEDUC and SCHOOLNOW by collapsing all unknowns
df$EDUC[df$EDUC %in% c(996, 997, 998, 999)] <- 996
df$SPOUSEDUC[df$SPOUSEDUC %in% c(97, 98, 99)] <- 99
df$SCHOOLNOW[df$SCHOOLNOW %in% c(7, 8, 9)] <- 9
# h. Handling Employment Status (EMPSTAT)
# IPUMS codes:
#   0 NIU; 100 Employed; 110-112 working variants;
#   120-122 with job but not at work; 200 not employed;
#   210-217 unemployed variants; 220 not in labor force;
#   900 unknown-all causes; 997/998/999 refused / not ascertained / don't know
# Collapse: working -> 110, with-job -> 120, unemployed/not employed -> 200,
# refused/not ascertained/don't know -> 999.
df$EMPSTAT[df$EMPSTAT %in% c(110, 111, 112)] <- 110
df$EMPSTAT[df$EMPSTAT %in% c(120, 121, 122)] <- 120
df$EMPSTAT[df$EMPSTAT %in% c(200, 210:217)] <- 200
df$EMPSTAT[df$EMPSTAT %in% 997:999] <- 999
# i. Handling HOURSWRK: treat unknown hour codes (97-99) as 0 hours worked
df$HOURSWRK[df$HOURSWRK %in% c(0, 97:99)] <- 0
# j. Handling all remaining variables
# Collapse every unknown code (values > 4, i.e. 7/8/9) into a single 9
df <- df %>% mutate(PAIDSICK = replace(PAIDSICK,PAIDSICK >4,9))
# Mutating EMPHI (insurance coverage through the company): same collapse
df <- df %>% mutate(EMPHI = replace(EMPHI,EMPHI >4,9))
# Mutating USUALPL (usual place for medical care)
# value label
# 0 NIU
# 1 There is no place or No
# 2 Yes, has a usual place or Yes
# 3 There is more than one place
# 7 Unknown-refused
# 8 Unknown-not ascertained
# 9 Unknown-don't know
# 2-3 combined as 2 and 7,8,9 combined as 9
df <- df %>%
mutate(USUALPL = replace(USUALPL, USUALPL == 3,2))
df <- df %>%
mutate(USUALPL = replace(USUALPL, USUALPL >= 7,9))
# Mutating HINOTCOVE
# Combine all unknowns into one (codes > 4 -> 9)
df <- df %>% mutate(HINOTCOVE = replace(HINOTCOVE,HINOTCOVE >4,9))
# Mutating CVDTEST (whether the respondent was tested; 1 No, 2 Yes)
# Same transformation as above
df <- df %>% mutate(CVDTEST = replace(CVDTEST,CVDTEST >4,9))
# Mutating CVDDIAG (whether the respondent was diagnosed)
# Same transformation as above
df <- df %>% mutate(CVDDIAG = replace(CVDDIAG,CVDDIAG >4,9))
# Mutating CVDTESTRSLT (test result)
# Labels:
# value label
# 0 NIU
# 1 No
# 2 Yes
# 3 Did not receive results
# 7 Unknown-refused
# 8 Unknown-not ascertained
# 9 Unknown-don't know
# Similar transformation where we combine all unknowns to 9
df <- df %>% mutate(CVDTESTRSLT = replace(CVDTESTRSLT,CVDTESTRSLT >4,9))
# A. Checking the relationship between insurance coverage and test results
counts_hinc <- table(df$HINOTCOVE, df$CVDTESTRSLT)
# HINOTCOVE rows after recoding: 1 = has coverage, 2 = no coverage, 9 = unknown
rownames(counts_hinc) <- c('Has coverage','No coverage','Unknown')
colnames(counts_hinc) <- c('NIU', 'Negative','Positive',"Did Not Receive results",'Unknown')
# Fixes: rainbow() takes a single count — rainbow(3:6) silently used only the
# first element; and the legend previously listed only 2 of the 3 row
# categories stacked in each bar, so the 'Unknown' segment was unlabeled.
barplot(counts_hinc[, 2:3], col = rainbow(nrow(counts_hinc)),
        main = 'Test Results based on Insurance Coverage',
        legend = rownames(counts_hinc), xlab = 'COVID Test Result')
# B. Number of people tested, split by whether the company provides
# health insurance coverage (EMPHI)
# .groups = "drop" replaces the default grouped output (and its message)
sample_df <- df %>%
  group_by(CVDTEST = as.factor(CVDTEST), EMPHI = as.factor(EMPHI)) %>%
  dplyr::summarize(count = n(), .groups = "drop")
# Fixes:
# - labeller() was keyed on CVDTEST while faceting on EMPHI, so the intended
#   facet labels (which describe EMPHI categories) were silently ignored.
# - CVDTEST codes are 0/1/2/9 = NIU / No / Yes / Unknown (whether tested),
#   so the x labels "Negative"/"Positive" described results, not testing.
# - theme_minimal() was applied after theme(legend.position = "none") and
#   reset it; apply the complete theme first, then the tweaks.
ggplot(sample_df, aes(fill = CVDTEST, y = count, x = CVDTEST)) +
  geom_bar(position = "dodge", stat = "identity") +
  ggtitle("People who tested when they get Health insurance coverage from the Company") +
  facet_wrap(~EMPHI, ncol = 4,
             labeller = labeller(EMPHI = c("0" = "NIU",
                                           "1" = "Not Covered by Company",
                                           "2" = "Covered by Company",
                                           "9" = "Unknown"))) +
  xlab("Covid Test") + ylab("Frequency") +
  scale_x_discrete(labels = c("NIU", "Not Tested", "Tested", "Unknown")) +
  scale_fill_manual(values = c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C")) +
  theme_minimal() +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 30))
# C. Test results compared between workers with and without paid sick leave
counts_ps <- table(df$PAIDSICK, df$CVDTESTRSLT)
rownames(counts_ps) <- c('NIU', 'No sick leave', 'Sick leave', 'Unknown')
colnames(counts_ps) <- c('NIU', 'Negative', 'Positive',
                         "Did Not Receive Results", 'Unknown')
# Plot only the No/Sick-leave rows against Negative/Positive results
barplot(counts_ps[2:3, 2:3],
        col = rainbow(2),
        legend = rownames(counts_ps)[2:3],
        main = 'Test Results based on having Sick Leave',
        xlab = 'Testing Opportunity')
# D. Does having a usual place for medical care affect testing?
# Keep only respondents with a usual place (USUALPL == 2 after recoding),
# then count them per CVDTEST category.
sample_df <- df %>%
  filter(USUALPL == 2) %>%
  group_by(CVDTEST) %>%
  summarise(n = n())
sample_df %>% head()
## # A tibble: 4 x 2
## CVDTEST n
## <int+lbl> <int>
## 1 0 [NIU] 16418
## 2 1 [No] 12956
## 3 2 [Yes] 5236
## 4 9 [Unknown-don't know] 33
# Highlight the "Yes, tested" bar in red, grey out the rest
bar_colors <- c('rgba(204,204,204,1)', 'rgba(204,204,204,1)',
                'rgba(222,45,38,0.8)', 'rgba(204,204,204,1)')
fig <- plot_ly(sample_df, x = ~CVDTEST, y = ~n, type = 'bar',
               marker = list(color = bar_colors))
fig <- fig %>%
  layout(title = "Seeing if having Usual Place of medical care encourages Covid Testing",
         xaxis = list(title = ""),
         yaxis = list(title = ""))
fig
# How does socioeconomic status affect COVID-19 infection status?
# A. Creating a stratified sample
table(df$CVDDIAG)
##
## 0 1 2 9
## 17649 18954 671 84
# Drop the NIU (0) and Unknown (9) categories — they provide no insight
# for a diagnosed-vs-not classification
sample_df <- df %>% filter(CVDDIAG != 0)
sample_df <- sample_df %>% filter(CVDDIAG != 9)
strat <- stratified(sample_df, group = 'CVDDIAG', size = 2000)
## Groups 2 contain fewer rows than requested. Returning all rows.
# Recode to a 0/1 outcome: 1 (no diagnosis) -> 0, 2 (diagnosed) -> 1.
# Order matters: 1 is mapped away before 2 is mapped onto 1.
strat$CVDDIAG[strat$CVDDIAG == 1] <- 0
strat$CVDDIAG[strat$CVDDIAG == 2] <- 1
# Class distribution of the stratified sample
table(strat$CVDDIAG)/length(strat$CVDDIAG)
##
## 0 1
## 0.7487832 0.2512168
# Visualising it
barplot(table(strat$CVDDIAG))
# 70/30 train/test split, seeded for reproducibility
set.seed(101)
# Fix: spell out FALSE — T and F are ordinary, reassignable variables
idx <- sample.int(n = nrow(strat), size = floor(0.70 * nrow(strat)),
                  replace = FALSE)
s.train <- strat[idx, ]
s.test <- strat[-idx, ]
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
# Selecting personal-information variables
# Fixes: "EDUC" was listed twice; wrap the external vector in all_of() so
# tidyselect does not confuse it with a column name (this also removes the
# ambiguity warning the original emitted).
per_info_vars <- c("AGE", "SEX", "MARST", "RACEA", "SEXORIEN", "EDUC",
                   "ASTATFLG", "CSTATFLG", "REGION", "CVDDIAG")
data_to_study.train <- s.train %>% dplyr::select(all_of(per_info_vars))
data_to_study.test <- s.test %>% dplyr::select(all_of(per_info_vars))
# Fit a logistic regression and let stepAIC (k = 2, i.e. classic AIC)
# search for the optimal combination of variables
model <- glm(data = data_to_study.train, CVDDIAG ~ ., family = 'binomial') %>%
  stepAIC(trace = FALSE, k = 2)
# Printing the summary
summary(model)
##
## Call:
## glm(formula = CVDDIAG ~ AGE + RACEA + ASTATFLG, family = "binomial",
## data = data_to_study.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1330 -0.7770 -0.6897 -0.4923 4.8696
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.9427903 0.2391516 -8.124 4.52e-16 ***
## AGE -0.0113351 0.0031403 -3.610 0.000307 ***
## RACEA 0.0008603 0.0002546 3.379 0.000727 ***
## ASTATFLG 1.3012420 0.2700237 4.819 1.44e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2079.2 on 1868 degrees of freedom
## Residual deviance: 2034.4 on 1865 degrees of freedom
## AIC: 2042.4
##
## Number of Fisher Scoring iterations: 5
# Confusion matrix for the stepAIC-selected logistic model
lr_probs <- predict(model, newdata = data_to_study.test, type = 'response')
# Threshold the predicted probabilities at 0.5
lr_predicted <- ifelse(lr_probs < 0.5, 0, 1)
# Build both factors over the same 0..1 level range so the matrix is square
outcome_levels <- min(data_to_study.test$CVDDIAG):max(data_to_study.test$CVDDIAG)
confusionMatrix(factor(lr_predicted, levels = outcome_levels),
                factor(data_to_study.test$CVDDIAG, levels = outcome_levels))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 588 214
## 1 0 0
##
## Accuracy : 0.7332
## 95% CI : (0.7011, 0.7635)
## No Information Rate : 0.7332
## P-Value [Acc > NIR] : 0.5184
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.7332
## Neg Pred Value : NaN
## Prevalence : 0.7332
## Detection Rate : 0.7332
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 0
##
# Diagnostic plots for the stepAIC-selected model
plot(model)
# Refit with a manually chosen set of demographic predictors for comparison
# against the stepAIC selection
m1 <- glm(data = data_to_study.train, CVDDIAG ~ AGE + RACEA + SEX + SEXORIEN + MARST + REGION + EDUC, family = 'binomial')
summary(m1)
##
## Call:
## glm(formula = CVDDIAG ~ AGE + RACEA + SEX + SEXORIEN + MARST +
## REGION + EDUC, family = "binomial", data = data_to_study.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3057 -0.7649 -0.6875 -0.5501 4.4858
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.6350381 0.3490563 -4.684 2.81e-06 ***
## AGE -0.0093570 0.0032519 -2.877 0.004010 **
## RACEA 0.0009083 0.0002589 3.508 0.000452 ***
## SEX 0.0933984 0.1099770 0.849 0.395740
## SEXORIEN 0.3217312 0.0951322 3.382 0.000720 ***
## MARST -0.0018908 0.0024325 -0.777 0.436985
## REGION -0.0083507 0.0530949 -0.157 0.875025
## EDUC 0.0003948 0.0004548 0.868 0.385434
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2079.2 on 1868 degrees of freedom
## Residual deviance: 2039.3 on 1861 degrees of freedom
## AIC: 2055.3
##
## Number of Fisher Scoring iterations: 5
# Confusion matrix for the manually specified model m1
m1_probs <- predict(m1, newdata = data_to_study.test, type = 'response')
lr_probs <- m1_probs
# Threshold at 0.5 as before
lr_predicted <- ifelse(lr_probs < 0.5, 0, 1)
# Shared level range keeps the confusion matrix square
outcome_levels <- min(data_to_study.test$CVDDIAG):max(data_to_study.test$CVDDIAG)
cf <- confusionMatrix(factor(lr_predicted, levels = outcome_levels),
                      factor(data_to_study.test$CVDDIAG, levels = outcome_levels))
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
# Render m1's confusion matrix as a LaTeX table
conf.mat <- as.data.frame.matrix(cf$table)
stargazer(conf.mat, title = "Confusion Matrix", summary = FALSE)
##
## % Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
## % Date and time: Wed, Dec 08, 2021 - 17:59:02
## \begin{table}[!htbp] \centering
## \caption{Confusion Matrix}
## \label{}
## \begin{tabular}{@{\extracolsep{5pt}} ccc}
## \\[-1.8ex]\hline
## \hline \\[-1.8ex]
## & 0 & 1 \\
## \hline \\[-1.8ex]
## 0 & $587$ & $213$ \\
## 1 & $1$ & $1$ \\
## \hline \\[-1.8ex]
## \end{tabular}
## \end{table}
# What are the driving factors behind infections?
# A. Using logistic regression to see what affects the diagnosis and selecting the best model using stepAIC
library(randomForestExplainer)
## Warning: package 'randomForestExplainer' was built under R version 4.1.2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Creating predictor (x) and response (y) samples from the train/test splits
x.train <- s.train %>% dplyr::select(-CVDDIAG)
y.train <- s.train %>% dplyr::select(CVDDIAG)
x.test <- s.test %>% dplyr::select(-CVDDIAG)
y.test <- s.test %>% dplyr::select(CVDDIAG)
# NOTE(review): x.train still contains CVDTEST and CVDTESTRSLT, which record
# whether the respondent was tested and the test result — near-proxies for
# the CVDDIAG outcome (the importance output below shows CVDTESTRSLT
# dominating). Consider dropping both to measure the remaining drivers —
# TODO confirm this is intended.
model.rf <- randomForest::randomForest(x = x.train, y = as.factor(y.train$CVDDIAG), ntree = 500, importance = T, proximity = T)
print(model.rf)
##
## Call:
## randomForest(x = x.train, y = as.factor(y.train$CVDDIAG), ntree = 500, importance = T, proximity = T)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 7.44%
## Confusion matrix:
## 0 1 class.error
## 0 1396 16 0.01133144
## 1 123 334 0.26914661
# Column 2 of the importance matrix = MeanDecreaseGini per variable
print(importance(model.rf,2))
## MeanDecreaseGini
## REGION 15.1776486
## URBRRL 15.5461451
## PERNUM 0.4920045
## ASTATFLG 0.2890717
## CSTATFLG 0.4990005
## AGE 39.5814821
## SEX 6.7601661
## SEXORIEN 7.3789954
## MARST 10.9652030
## FAMSIZE 14.4032490
## PARTNEREMP 3.5806953
## FAMKIDNO 9.5932683
## RACEA 11.7607230
## ARMFEV 4.6114876
## EDUC 19.6018951
## SPOUSEDUC 12.6240833
## SCHOOLNOW 4.1628201
## EMPSTAT 2.7761303
## HOURSWRK 20.6186549
## PAIDSICK 6.1470726
## EMPHI 6.2977804
## EMPFT 4.6548641
## INCFAM07ON 12.7385126
## FAMTOTINC 38.5097475
## USUALPL 2.4717670
## HINOTCOVE 3.0837250
## CVDTEST 86.6101415
## CVDTESTRSLT 310.7068821
# OOB error as a function of the number of trees
plot(model.rf)
# Evaluate the forest on the held-out test set
predictions <- predict(model.rf, newdata = x.test)
actual <- droplevels(as.factor(y.test$CVDDIAG))
confusionMatrix(predictions, actual)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 580 56
## 1 8 158
##
## Accuracy : 0.9202
## 95% CI : (0.8992, 0.938)
## No Information Rate : 0.7332
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7804
##
## Mcnemar's Test P-Value : 4.228e-09
##
## Sensitivity : 0.9864
## Specificity : 0.7383
## Pos Pred Value : 0.9119
## Neg Pred Value : 0.9518
## Prevalence : 0.7332
## Detection Rate : 0.7232
## Detection Prevalence : 0.7930
## Balanced Accuracy : 0.8624
##
## 'Positive' Class : 0
##
# Multi-metric importance table (min depth, node counts, accuracy/gini
# decrease, times-a-root, p-values) from randomForestExplainer
frame <- measure_importance(model.rf)
frame
## variable mean_min_depth no_of_nodes accuracy_decrease gini_decrease
## 1 AGE 2.58000 11141 1.176346e-02 39.5814821
## 2 ARMFEV 5.25932 2004 6.923569e-04 4.6114876
## 3 ASTATFLG 10.13896 149 2.853428e-04 0.2890717
## 4 CSTATFLG 9.93798 157 2.608123e-04 0.4990005
## 5 CVDTEST 2.68674 2157 4.987778e-02 86.6101415
## 6 CVDTESTRSLT 1.83000 5205 1.667199e-01 310.7068821
## 7 EDUC 3.59200 7439 5.184685e-03 19.6018951
## 8 EMPFT 5.82216 1590 4.602736e-03 4.6548641
## 9 EMPHI 5.21142 1888 4.163632e-03 6.2977804
## 10 EMPSTAT 6.20004 1035 1.828076e-03 2.7761303
## 11 FAMKIDNO 4.26200 3830 6.560714e-03 9.5932683
## 12 FAMSIZE 3.66200 5969 6.278135e-03 14.4032490
## 13 FAMTOTINC 2.81600 11199 9.298352e-03 38.5097475
## 14 HINOTCOVE 6.35080 1256 3.905990e-04 3.0837250
## 15 HOURSWRK 3.15200 5547 1.014233e-02 20.6186549
## 16 INCFAM07ON 4.13200 5176 4.226255e-03 12.7385126
## 17 MARST 4.16000 4580 5.607950e-03 10.9652030
## 18 PAIDSICK 5.17654 2052 3.661607e-03 6.1470726
## 19 PARTNEREMP 6.27538 1257 2.188198e-04 3.5806953
## 20 PERNUM 9.42906 282 6.436061e-04 0.4920045
## 21 RACEA 3.65400 3713 1.653747e-03 11.7607230
## 22 REGION 3.86200 6369 -8.313457e-04 15.1776486
## 23 SCHOOLNOW 5.21952 1543 1.332205e-03 4.1628201
## 24 SEX 4.99322 3477 6.147641e-05 6.7601661
## 25 SEXORIEN 4.10322 2220 2.951308e-03 7.3789954
## 26 SPOUSEDUC 4.26600 4781 3.998279e-03 12.6240833
## 27 URBRRL 3.55600 6247 1.933427e-03 15.5461451
## 28 USUALPL 7.11012 1041 5.671243e-05 2.4717670
## no_of_trees times_a_root p_value
## 1 500 33 0.000000e+00
## 2 494 5 1.000000e+00
## 3 132 5 1.000000e+00
## 4 141 6 1.000000e+00
## 5 483 69 1.000000e+00
## 6 500 83 3.141836e-127
## 7 500 6 0.000000e+00
## 8 472 31 1.000000e+00
## 9 489 49 1.000000e+00
## 10 418 18 1.000000e+00
## 11 500 0 9.723098e-03
## 12 500 2 3.471439e-271
## 13 500 5 0.000000e+00
## 14 460 3 1.000000e+00
## 15 500 69 9.891712e-186
## 16 500 0 9.879519e-123
## 17 500 9 2.623109e-47
## 18 493 44 1.000000e+00
## 19 471 1 1.000000e+00
## 20 227 1 1.000000e+00
## 21 500 22 3.486345e-01
## 22 500 1 0.000000e+00
## 23 484 12 1.000000e+00
## 24 499 0 9.998416e-01
## 25 499 15 1.000000e+00
## 26 500 3 5.066819e-69
## 27 500 8 0.000000e+00
## 28 454 0 1.000000e+00
# Pairwise comparison of the importance measures, and their rank agreement
plot_importance_ggpairs(frame)
plot_importance_rankings(frame)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Variable importance bar chart (MeanDecreaseAccuracy/MeanDecreaseGini are
# columns 3:4 of the classification importance matrix)
imp <- as.data.frame(model.rf$importance[, 3:4])
# Fix: dplyr::arrange() silently drops data.frame row names, so after sorting
# rownames(imp) were "1".."28" and every bar was mislabeled. Preserve the
# variable names as an explicit column before sorting.
imp$Variable <- rownames(imp)
imp <- imp %>% arrange(desc(MeanDecreaseGini))
ggplot(imp, aes(x = reorder(Variable, MeanDecreaseGini),
                y = MeanDecreaseGini, fill = Variable)) +
  geom_bar(stat = "identity") +
  ggtitle("Variable Importance based on MeanDecreaseGini") +
  theme_minimal() +
  coord_flip() +
  xlab("Variables") +
  ylab("Importance of the variables")
# What type of working conditions impacts the ability to get tested for COVID?